# Package setup ----
library(ISLR)
library(dplyr)
library(readr)
library(broom)
library(ggplot2)
library(tidymodels)
library(stringr)
library(splitstackshape) # cSplit() for splitting the comma-separated Genre column
library(lubridate)
library(rpart.plot)
library(cluster)
library(forcats)
# Resolve function-name conflicts (e.g. filter, select) in favor of tidymodels
tidymodels_prefer()
library(probably) #install.packages('probably')
library(vip)
# Load the pre-cleaned IMDB top-1000 data set
imdb_top_1000 <- read_csv("~/Desktop/Statistical Machine Learning/R Files/Final Project/imdb_top_1000_CLEAN.csv")

# Split the comma-separated Genre column into Genre_1, Genre_2, ... and model
# profit on the log scale: Gross = log(Revenue - Budget). Movies with
# Revenue <= Budget produce -Inf/NaN here and are removed below.
imdb_clean <- imdb_top_1000 %>%
  cSplit("Genre", sep = ",", direction = "wide") %>%
  mutate(Gross = log(Revenue - Budget)) %>%
  select(-...15) # drop the stray unnamed column read in as `...15`

# Runtime is stored as e.g. "142 min"; strip the unit and convert to numeric
runtime_clean <- imdb_top_1000$Runtime %>%
  str_replace(" min", "") %>%
  as.numeric()
imdb_clean$Runtime <- runtime_clean

# Keep only rows with a finite log-profit and complete Gross/Budget.
# FIX: was `filter(Gross != "-Inf")`, which compared a numeric column to a
# character string (relying on implicit as.character coercion);
# is.finite() states the intent directly and also drops the NaN rows
# (log of a negative) that drop_na() was previously catching.
imdb_clean <- imdb_clean %>%
  filter(is.finite(Gross)) %>%
  drop_na(Gross, Budget)
head(imdb_clean)
# 10-fold cross-validation split reused by all regression models below
data_cv10 <- vfold_cv(imdb_clean, v = 10)
# Model Spec: ordinary least squares
lm_spec <-
linear_reg() %>%
set_engine(engine = 'lm') %>%
set_mode('regression')
# Recipe: five predictors; normalize numerics, dummy-code Genre_1, drop NAs
full_lm_rec <- recipe(Gross ~ Runtime + IMDB_Rating + Meta_score +
No_of_Votes + Genre_1, data = imdb_clean) %>%
step_nzv(all_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_naomit(Gross, Runtime, IMDB_Rating, Meta_score, No_of_Votes)
# Workflow (recipe + model)
full_lm_wf <- workflow() %>%
add_recipe(full_lm_rec) %>%
add_model(lm_spec)
# Fit on the full training data and inspect coefficients
full_lm_model <- fit(full_lm_wf, data = imdb_clean)
full_lm_model %>% tidy()
# Cross-validated error metrics for the OLS model
full_lm_modelcv <- fit_resamples(full_lm_wf, resamples = data_cv10, metrics = metric_set(rmse, rsq, mae))
full_lm_modelcv %>%
collect_metrics()
# Lasso Model Spec with tune
lm_lasso_spec_tune <-
linear_reg() %>%
set_args(mixture = 1, penalty = tune()) %>% # mixture = 1 indicates Lasso
set_engine(engine = 'glmnet') %>%
set_mode('regression')
# Recipe
data_rec_lasso <- recipe(Gross ~ Runtime + IMDB_Rating + Meta_score +
No_of_Votes + Genre_1, data = imdb_clean) %>%
step_nzv(all_predictors()) %>% # removes variables with the same value (don't want duplicates)
step_novel(all_nominal_predictors()) %>% # important if you have rare categorical variables
step_normalize(all_numeric_predictors()) %>% # standardization important step for LASSO
step_dummy(all_nominal_predictors()) %>% # creates indicator variables for categorical variables
step_naomit(Gross, Runtime, IMDB_Rating, # omit any NA values
Meta_score, No_of_Votes)
# Workflow
lasso_wf_tune <- workflow() %>%
add_recipe(data_rec_lasso) %>%
add_model(lm_lasso_spec_tune)
# Tune Model (trying a variety of values of Lambda penalty)
# 30 penalty values; penalty() range is on the log10 scale (dials default),
# i.e. lambda from 1e-3 to 10
penalty_grid <- grid_regular(
penalty(range = c(-3, 1)),
levels = 30)
tune_res <- tune_grid(
lasso_wf_tune,
resamples = data_cv10,
metrics = metric_set(rmse, mae),
grid = penalty_grid
)
# Visualize Model Evaluation Metrics from Tuning
autoplot(tune_res) + theme_classic()
# Collect CV Metrics and Select Best Model
# Summarize Model Evaluation Metrics (CV)
lasso_mod <- collect_metrics(tune_res) %>%
filter(.metric == 'rmse') %>%
select(penalty, rmse = mean)
# Choose penalty value with the lowest CV RMSE
best_penalty <- select_best(tune_res, metric = 'rmse')
lasso_mod
# Fit Final Model
final_wf <- finalize_workflow(lasso_wf_tune, best_penalty) # incorporates penalty value to workflow
final_fit <- fit(final_wf, data = imdb_clean)
# CV metrics for the finalized LASSO workflow
lasso_fit <- fit_resamples(final_wf, resamples = data_cv10, metrics = metric_set(rmse, rsq, mae))
tidy(final_fit)
# Final ("best") model predictors and coefficients (non-zero only)
final_fit %>% tidy() %>% filter(estimate != 0)
lasso_fit %>%
collect_metrics()
# Training-set predictions and residuals for the finalized LASSO model
lasso_mod_out <- predict(final_fit, new_data = imdb_clean) %>%
  bind_cols(imdb_clean) %>%
  mutate(resid = Gross - .pred)

# Residuals vs fitted values
lasso_mod_out %>%
  ggplot(aes(x = .pred, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()

# Residuals vs each quantitative predictor
lasso_mod_out %>%
  ggplot(aes(x = Runtime, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()

lasso_mod_out %>%
  ggplot(aes(x = IMDB_Rating, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()

lasso_mod_out %>%
  ggplot(aes(x = No_of_Votes, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()
# Build the GAM: smooth terms s() on Runtime, IMDB_Rating and No_of_Votes;
# Meta_score and Genre_1 enter as parametric (linear/dummy) terms
gam_spec <-
gen_additive_mod() %>%
set_engine(engine = 'mgcv') %>%
set_mode('regression')
gam_mod1 <- fit(gam_spec,
Gross ~ s(Runtime) + s(IMDB_Rating) + Meta_score + s(No_of_Votes) + Genre_1,
data = imdb_clean
)
# Diagnostics: Check to see if the number of knots is large enough (if p-value is low, increase number of knots)
gam_mod1 %>% pluck('fit') %>% mgcv::gam.check()
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 4 iterations.
## The RMS GCV score gradient at convergence was 1.046117e-05 .
## The Hessian was positive definite.
## Model rank = 41 / 41
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Runtime) 9.00 2.27 0.92 0.04 *
## s(IMDB_Rating) 9.00 4.08 0.97 0.21
## s(No_of_Votes) 9.00 4.30 1.06 0.90
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Summarize the fitted GAM: parametric coefficients plus approximate
# significance (edf, F, p-value) of the smooth terms
gam_mod1 %>% pluck('fit') %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Gross ~ s(Runtime) + s(IMDB_Rating) + Meta_score + s(No_of_Votes) +
## Genre_1
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.981283 0.430198 39.473 < 2e-16 ***
## Meta_score 0.009396 0.005328 1.763 0.078414 .
## Genre_1Adventure -0.154761 0.247761 -0.625 0.532484
## Genre_1Animation 1.157754 0.299124 3.870 0.000123 ***
## Genre_1Biography 0.134568 0.236486 0.569 0.569583
## Genre_1Comedy 0.104267 0.226320 0.461 0.645206
## Genre_1Crime -0.671359 0.234979 -2.857 0.004448 **
## Genre_1Drama -0.390133 0.191382 -2.039 0.042010 *
## Genre_1Family -0.509639 0.975559 -0.522 0.601611
## Genre_1Film-Noir -2.833038 0.985900 -2.874 0.004226 **
## Genre_1Horror 0.406753 0.508980 0.799 0.424570
## Genre_1Mystery -1.162116 0.580799 -2.001 0.045928 *
## Genre_1Thriller -0.220396 1.373228 -0.160 0.872554
## Genre_1Western -0.527771 0.821316 -0.643 0.520775
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Runtime) 2.270 2.909 20.14 <2e-16 ***
## s(IMDB_Rating) 4.076 5.024 17.84 <2e-16 ***
## s(No_of_Votes) 4.301 5.314 60.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.504 Deviance explained = 52.5%
## GCV = 1.9256 Scale est. = 1.8377 n = 540
# Visualize: Look at the estimated non-linear functions
gam_mod1 %>% pluck('fit') %>% plot()
# In-sample predictions and residuals for the GAM
gam1_output <- gam_mod1%>%
predict(new_data = imdb_clean) %>%
bind_cols(imdb_clean) %>%
mutate(resid = Gross - .pred)
# Training-set (not cross-validated) error metrics
gam1_output %>%
rmse(truth = Gross, estimate = .pred)
gam1_output %>%
rsq(truth = Gross, estimate = .pred)
# Natural-spline recipe approximating the GAM: ns() degrees of freedom chosen
# near the edf estimates reported by the mgcv fit above (Runtime ~ 2.3,
# IMDB_Rating ~ 4.1, No_of_Votes ~ 4.3)
spline_rec <- recipe(Gross ~ Runtime + IMDB_Rating + Meta_score +
  No_of_Votes + Genre_1, data = imdb_clean) %>%
  step_nzv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_naomit(Gross) %>%
  step_ns(Runtime, deg_free = 2) %>%
  step_ns(No_of_Votes, deg_free = 6) %>%
  step_ns(IMDB_Rating, deg_free = 2)
spline_rec %>% prep(imdb_clean) %>% juice()

# Build the GAM: plain lm on the spline-expanded predictors
lm_spec_gam <-
  linear_reg() %>%
  set_engine(engine = 'lm') %>%
  set_mode('regression')

spline_wf <- workflow() %>%
  # FIX: was add_model(lm_spec); lm_spec_gam was defined immediately above
  # for this purpose but never used (the two specs are identical, so
  # behavior is unchanged — this is a consistency fix).
  add_model(lm_spec_gam) %>%
  add_recipe(spline_rec)

# Cross-validated metrics for the spline model
cv_output_spline2 <- fit_resamples(
  spline_wf, # workflow
  resamples = data_cv10, # cv folds
  metrics = metric_set(mae,rmse,rsq)
)
cv_output_spline2 %>% collect_metrics()

# Fit on the full data and inspect coefficients
new_mod <- spline_wf %>%
  fit(data = imdb_clean)
new_mod %>%
  tidy()
new_mod %>%
  tidy() %>%
  arrange(desc(abs(statistic)))
# NOTE(review): new_modcv duplicates cv_output_spline2 and is never used below
new_modcv <- fit_resamples(spline_wf, resamples = data_cv10, metrics = metric_set(mae, rmse, rsq))
# Lasso Model Spec with tune
# NOTE(review): named "gam_lasso" but the recipe below contains no spline/GAM
# terms — this is a plain LASSO on the raw predictors; confirm intended.
gam_lasso_spec_tune <-
linear_reg() %>%
set_args(mixture = 1, penalty = tune()) %>% ## mixture = 1 indicates Lasso
set_engine(engine = 'glmnet') %>%
set_mode('regression')
# Recipe with standardization (!) --> just include all these always
data_rec <- recipe(Gross ~ Runtime + IMDB_Rating + Meta_score +
No_of_Votes + Genre_1, data = imdb_clean) %>%
step_nzv(all_predictors()) %>% # removes variables with the same value (so duplicates don't mess up model)
step_novel(all_nominal_predictors()) %>% # important if you have rare categorical variables
step_normalize(all_numeric_predictors()) %>% # super important standardization step for LASSO
step_dummy(all_nominal_predictors()) %>% # creates indicator variables for categorical variables
step_naomit(Gross)
# Workflow (Recipe + Model)
lasso_wf_tune1 <- workflow() %>%
add_recipe(data_rec) %>%
add_model(gam_lasso_spec_tune)
# Tune Model (trying a variety of values of Lambda penalty)
# Same 30-value log10 penalty grid as the first LASSO
penalty_grid <- grid_regular(
penalty(range = c(-3, 1)),
levels = 30)
tune_res1 <- tune_grid( # new function for tuning parameters
lasso_wf_tune1, # workflow
resamples = data_cv10, # cv folds
metrics = metric_set(rmse, mae),
grid = penalty_grid # penalty grid defined above
)
# Visualize Model Evaluation Metrics from Tuning
autoplot(tune_res1) + theme_classic()
# Summarize Model Evaluation Metrics (CV)
# FIX: summarize the *second* tuning run (tune_res1); previously this
# re-collected metrics from the first LASSO's tune_res.
collect_metrics(tune_res1) %>%
  filter(.metric == 'rmse') %>% # or choose mae
  select(penalty, rmse = mean)
best_penalty1 <- select_best(tune_res1, metric = 'rmse') # choose penalty value based on lowest mae or rmse
# Fit Final Model
# FIX: finalize with best_penalty1 chosen just above; previously used
# best_penalty from the earlier LASSO section, silently reusing the wrong
# tuning result.
final_wf_gam_lasso <- finalize_workflow(lasso_wf_tune1, best_penalty1) # incorporates penalty value to workflow
final_fit_gam_lasso <- fit(final_wf_gam_lasso, data = imdb_clean)
tidy(final_fit_gam_lasso)
# Final ("best") model predictors and coefficients (non-zero only)
final_fit_gam_lasso %>% tidy() %>% filter(estimate != 0)
gam_lasso_output <- fit_resamples(
  final_wf_gam_lasso, # workflow
  resamples = data_cv10, # cv folds
  metrics = metric_set(mae,rmse,rsq)
)
gam_lasso_output %>% collect_metrics()
# Side-by-side comparison of the candidate models
gam1_output %>%
  rmse(truth = Gross, estimate = .pred) # mgcv GAM (training-set RMSE)
cv_output_spline2 %>% collect_metrics() # spline lm (CV)
gam_lasso_output %>% collect_metrics() # LASSO (CV)
# Residual diagnostics for the fitted spline model.
# NOTE: this rebinds `gam1_output`, replacing the earlier mgcv-based version.
gam1_output <- predict(new_mod, new_data = imdb_clean) %>%
  bind_cols(imdb_clean) %>%
  mutate(resid = Gross - .pred)

# Residuals vs fitted values
gam1_output %>%
  ggplot(aes(x = .pred, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()

# Residuals vs each quantitative predictor
gam1_output %>%
  ggplot(aes(x = Runtime, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()

gam1_output %>%
  ggplot(aes(x = IMDB_Rating, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()

gam1_output %>%
  ggplot(aes(x = No_of_Votes, y = resid)) +
  geom_point() +
  geom_smooth() +
  geom_hline(yintercept = 0, color = "red") +
  theme_classic()

# Final regression-model comparison
collect_metrics(full_lm_modelcv) #OLS
collect_metrics(lasso_fit) #LASSO
rmse(gam1_output, truth = Gross, estimate = .pred) #GAM
# Classification setup: a movie is a flop (TRUE) when its revenue is at most
# double its budget (success_ratio <= 2); drop rows missing any model variable
imdb_clean_class <- imdb_clean %>%
mutate(success_ratio = Revenue/Budget) %>%
mutate(flop = as.factor(ifelse(success_ratio > 2, 'FALSE', 'TRUE'))) %>%
drop_na(flop, No_of_Votes,Runtime, IMDB_Rating,Meta_score,Genre_1)
# Model Specification
rf_spec <- rand_forest() %>%
set_engine(engine = 'ranger') %>%
set_args(trees = 1000, # Number of trees
min_n = NULL,
probability = FALSE, # hard class predictions rather than probabilities
importance = 'impurity') %>%
set_mode('classification')
# Recipe
# NOTE(review): this rebinds `data_rec`, overwriting the regression recipe
# defined earlier in the script
data_rec <- recipe(flop ~ No_of_Votes + Runtime + IMDB_Rating + Meta_score + Genre_1,
data = imdb_clean_class) %>%
step_naomit(flop, No_of_Votes, Runtime, IMDB_Rating, Meta_score, Genre_1)
# Create Workflows: one per candidate mtry (predictors sampled per split)
data_wf_mtry3 <- workflow() %>%
add_model(rf_spec %>% set_args(mtry = 3)) %>%
add_recipe(data_rec)
data_wf_mtry4 <- workflow() %>%
add_model(rf_spec %>% set_args(mtry = 4)) %>%
add_recipe(data_rec)
data_wf_mtry5 <- workflow() %>%
add_model(rf_spec %>% set_args(mtry = 5)) %>%
add_recipe(data_rec)
# Fit Models
set.seed(123) # make sure to run this before each fit so that you have the same 1000 trees
data_fit_mtry3 <- fit(data_wf_mtry3, data = imdb_clean_class)
set.seed(123)
data_fit_mtry4 <- fit(data_wf_mtry4, data = imdb_clean_class)
set.seed(123)
data_fit_mtry5 <- fit(data_wf_mtry5, data = imdb_clean_class)
# Collect the out-of-bag (OOB) class predictions from a fitted ranger
# workflow alongside the observed outcomes and a user-supplied model label.
rf_OOB_output <- function(fit_model, model_label, truth){
  # ranger stores its OOB predictions on the underlying engine fit
  oob_preds <- fit_model %>%
    extract_fit_engine() %>%
    pluck('predictions')
  tibble(
    .pred_class = oob_preds,
    flop = truth,
    model = model_label
  )
}
# Evaluate OOB Metrics for the three candidate mtry values
data_rf_OOB_output <- bind_rows(
rf_OOB_output(data_fit_mtry3,3, imdb_clean_class %>% pull(flop)),
rf_OOB_output(data_fit_mtry4,4, imdb_clean_class %>% pull(flop)),
rf_OOB_output(data_fit_mtry5,5, imdb_clean_class %>% pull(flop))
)
data_rf_OOB_output %>%
group_by(model) %>%
accuracy(truth = flop, estimate = .pred_class)
# OOB accuracy vs mtry
data_rf_OOB_output %>%
group_by(model) %>%
accuracy(truth = flop, estimate = .pred_class) %>%
# NOTE(review): `model` already holds the bare numbers 3/4/5 passed above,
# so this str_replace is a no-op kept from a template
mutate(mtry = as.numeric(stringr::str_replace(model,'mtry',''))) %>%
ggplot(aes(x = mtry, y = .estimate )) +
geom_point() +
geom_line() +
theme_classic()
# Confusion matrix of OOB predictions for the mtry = 4 forest
rf_OOB_output(data_fit_mtry4,4, imdb_clean_class %>% pull(flop)) %>%
conf_mat(truth = flop, estimate= .pred_class)
## Truth
## Prediction FALSE TRUE
## FALSE 468 65
## TRUE 7 0
# Impurity-based variable importance from the mtry = 4 forest
model_output <- data_fit_mtry4 %>%
  extract_fit_engine()
model_output %>%
  vip(num_features = 10) + theme_classic() #based on impurity, 10 meaning the top 10
model_output %>% vip::vi() %>% head()
model_output %>% vip::vi() %>% tail()
# Permutation-based importance: refit the same workflow with permutation importance
model_output2 <- data_wf_mtry4 %>%
  # FIX: update_model() replaces the whole model spec, and the bare rf_spec
  # has no mtry set — the previous code silently refit with ranger's default
  # mtry instead of the mtry = 4 forest being analyzed. Keep mtry = 4.
  update_model(rf_spec %>% set_args(mtry = 4, importance = "permutation")) %>% #based on permutation
  fit(data = imdb_clean_class) %>%
  extract_fit_engine()
model_output2 %>%
  vip(num_features = 10) + theme_classic()
model_output2 %>% vip::vi() %>% head()
model_output2 %>% vip::vi() %>% tail()
# Distribution of each predictor, split by flop status
imdb_clean_class %>%
  ggplot(aes(x = flop, y = No_of_Votes)) +
  geom_violin() +
  theme_classic()
imdb_clean_class %>%
  ggplot(aes(x = flop, y = Runtime)) +
  geom_violin() +
  theme_classic()
imdb_clean_class %>%
  ggplot(aes(x = flop, y = IMDB_Rating)) +
  geom_violin() +
  theme_classic()
imdb_clean_class %>%
  ggplot(aes(x = flop, y = Meta_score)) +
  geom_violin() +
  theme_classic()
set.seed(123)
# Logistic Regression Model Spec
logistic_spec <- logistic_reg() %>%
set_engine('glm') %>%
set_mode('classification')
# Recipe (no preprocessing steps; glm handles the factor Genre_1 directly)
logistic_rec <- recipe(flop ~ No_of_Votes + Runtime + IMDB_Rating + Genre_1,
data = imdb_clean_class)
# Workflow (Recipe + Model) for Full Log Model
log_wf <- workflow() %>%
add_recipe(logistic_rec) %>%
add_model(logistic_spec)
# Fit Model
log_fit <- fit(log_wf, data = imdb_clean_class)
tidy(log_fit)
# Exponentiate coefficients to get odds ratios
log_fit %>% tidy() %>%
mutate(OR = exp(estimate))
# TODO: Ask Bryan about relevel factor… do we need it?
# Creation of CV Folds for the classification models
data_cv10_class <- vfold_cv(imdb_clean_class, v = 10)
# NOTE(review): full_log_wf/log_model duplicate log_wf/log_fit above
full_log_wf <- workflow() %>%
add_recipe(logistic_rec) %>%
add_model(logistic_spec)
log_model <- fit(full_log_wf, data = imdb_clean_class)
# Cross-validated accuracy, sensitivity and specificity
log_modelcv <- fit_resamples(full_log_wf, resamples = data_cv10_class, metrics = metric_set(accuracy,sens,yardstick::spec))
log_modelcv %>%
collect_metrics()
# Soft Predictions on Training Data
final_output <- log_fit %>% predict(new_data = imdb_clean_class, type='prob') %>% bind_cols(imdb_clean_class)
final_output %>%
ggplot(aes(x = flop, y = .pred_TRUE)) +
geom_boxplot()
# Use soft predictions: ROC curve with TRUE (flop) as the event
final_output %>%
roc_curve(flop,.pred_TRUE,event_level = 'second') %>%
autoplot()
# thresholds in terms of reference level (.pred_FALSE, the first factor level)
threshold_output <- final_output %>%
threshold_perf(truth = flop, estimate = .pred_FALSE, thresholds = seq(0,1,by=.01))
# J-index v. threshold for no flop
threshold_output %>%
filter(.metric == 'j_index') %>%
ggplot(aes(x = .threshold, y = .estimate)) +
geom_line() +
labs(y = 'J-index', x = 'threshold') +
theme_classic()
# Threshold maximizing the J-index
threshold_output %>%
filter(.metric == 'j_index') %>%
arrange(desc(.estimate))
# Distance v. threshold for no flop
threshold_output %>%
filter(.metric == 'distance') %>%
ggplot(aes(x = .threshold, y = .estimate)) +
geom_line() +
labs(y = 'Distance', x = 'threshold') +
theme_classic()
# Threshold minimizing distance to the ideal (0, 1) ROC corner
threshold_output %>%
filter(.metric == 'distance') %>%
arrange(.estimate)
# Hard classifications at the chosen 0.12 threshold on .pred_FALSE
log_metrics <- metric_set(accuracy,sens,yardstick::spec)
final_output %>%
mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(flop), threshold = .12)) %>%
log_metrics(truth = flop, estimate = .pred_class, event_level = 'second')
final_output %>%
mutate(.pred_class = make_two_class_pred(.pred_FALSE, levels(flop), threshold = .12)) %>%
conf_mat(truth = flop, estimate = .pred_class)
## Truth
## Prediction FALSE TRUE
## FALSE 475 65
## TRUE 0 0
# Predicted flop probability and class for a hypothetical new movie
new_movie <- data.frame(
  No_of_Votes = 10000,
  Runtime = 112,
  IMDB_Rating = 9.8,
  Genre_1 = "Drama"
)
predict(log_fit, new_data = new_movie, type = "prob")
predict(log_fit, new_data = new_movie, type = "class")
# NOTE: Must manually calculate hard prediction
# Pairwise scatterplots to explore candidate clustering variables
imdb_clean %>%
  ggplot(aes(x = Budget, y = Runtime)) +
  geom_point() +
  theme_classic()
imdb_clean %>%
  ggplot(aes(x = Budget, y = Gross)) +
  geom_point() +
  theme_classic()
imdb_clean %>%
  ggplot(aes(x = No_of_Votes, y = Runtime)) +
  geom_point() +
  theme_classic()
imdb_clean %>%
  ggplot(aes(x = Gross, y = No_of_Votes)) +
  geom_point() +
  theme_classic()
# Select just the Budget and Runtime variables for clustering
imdb_sub <- imdb_clean %>%
select(Budget, Runtime)
# Run k-means for k = centers = 10 on the scaled variables
set.seed(253)
kclust_k10 <- kmeans(scale(imdb_sub), centers = 10)
# Display the cluster assignments
kclust_k10$cluster
## [1] 7 9 3 10 5 10 9 10 8 4 8 7 9 8 4 2 8 2 4 10 1 7 2 2 2
## [26] 1 7 4 9 8 2 1 1 5 4 9 1 1 1 5 8 3 9 9 3 1 1 9 9 1
## [51] 5 1 1 2 1 2 4 1 1 1 2 2 5 10 5 1 2 9 9 9 1 2 2 9 2
## [76] 10 2 2 2 1 5 1 2 5 9 10 2 4 3 8 9 9 3 7 9 2 8 9 7 1
## [101] 7 9 10 2 7 7 2 5 9 2 10 2 2 1 2 5 2 2 9 1 4 7 2 2 5
## [126] 2 9 3 1 4 5 5 3 9 2 7 2 5 10 1 6 7 9 6 1 6 1 5 5 10
## [151] 1 7 2 2 5 2 9 1 5 10 1 2 5 10 2 2 10 2 2 2 2 10 1 9 1
## [176] 1 2 2 10 1 5 5 2 9 7 2 4 3 5 1 3 2 1 2 9 6 9 2 1 1
## [201] 7 4 6 7 1 3 9 7 2 4 8 4 6 10 10 2 1 2 2 7 5 5 1 9 5
## [226] 10 5 10 2 7 5 2 1 10 2 5 2 2 1 9 2 2 10 1 2 1 1 2 9 10
## [251] 5 2 9 2 5 1 2 2 5 5 9 1 1 5 1 2 1 1 7 1 3 1 3 2 5
## [276] 1 1 1 2 9 3 1 7 2 4 5 3 5 2 7 8 1 7 6 9 1 7 1 1 2
## [301] 1 5 5 7 5 2 7 2 9 5 1 1 4 10 10 2 5 1 5 1 1 1 1 7 6
## [326] 2 3 9 3 2 2 2 5 6 7 4 7 9 3 9 1 5 5 5 3 2 7 1 8 3
## [351] 7 1 2 8 10 7 7 7 2 7 5 1 1 4 6 8 7 9 7 1 2 7 5 1 1
## [376] 2 2 1 2 1 5 1 1 9 5 2 1 1 1 2 1 1 1 1 1 1 9 2 10 2
## [401] 1 5 5 1 1 1 5 4 2 1 3 9 1 3 5 5 6 6 3 2 7 1 3 2 7
## [426] 5 3 7 1 1 2 4 1 3 9 2 6 1 9 1 8 4 6 8 1 7 1 4 6 1
## [451] 1 5 7 5 1 7 4 7 7 5 10 7 5 10 1 7 2 6 9 2 5 5 2 1 5
## [476] 5 5 5 5 2 1 1 1 1 9 1 9 2 2 1 2 1 2 1 2 1 2 3 1 7
## [501] 7 7 1 5 3 1 1 1 6 1 1 1 7 5 1 4 7 1 7 6 4 7 1 1 3
## [526] 6 2 8 1 1 5 2 1 1 1 1 7 4 4 7 1 5 9 6 1 7 2 4 7 7
## [551] 1 1 2 1 9 5 5 5 1 1 5 5 5 1 5 2 7 9 5 1 10 2
# Add a variable (kclust_10) to the original dataset
# containing the cluster assignments
imdb_clean <- imdb_clean %>%
mutate(kclust_10 = factor(kclust_k10$cluster))
# Visualize the cluster assignments on the original scatterplot
imdb_clean %>%
ggplot(aes(x = Budget, y = Runtime, color = kclust_10)) +
geom_point() + theme_classic()
# Data-specific helper: k-means on the scaled Budget/Runtime data for a
# given k, returning the total within-cluster sum of squares.
imdb_cluster_ss <- function(k){
  fit <- kmeans(scale(imdb_sub), centers = k)
  fit$tot.withinss
}
# Elbow plot: total within-cluster SS for k = 1..15
k_values <- 1:15
wc_ss <- purrr::map_dbl(k_values, imdb_cluster_ss)
tibble(
  k = k_values,
  tot_wc_ss = wc_ss
) %>%
  ggplot(aes(x = k, y = tot_wc_ss)) +
  geom_point() +
  labs(x = "Number of clusters", y = 'Total within-cluster sum of squares') +
  theme_classic()
# Run k-means with k = 8 (no fresh set.seed here, so the result depends on
# the RNG state left by the elbow-plot loop above — TODO confirm intended)
kclust_k8 <- kmeans(scale(imdb_sub), centers = 8)
# Display the cluster assignments
kclust_k8$cluster
## [1] 5 2 8 2 1 2 5 2 8 7 6 5 5 6 7 3 6 3 6 2 4 3 3 3 3 4 3 6 5 6 3 4 4 1 6 5 4
## [38] 4 4 1 6 8 5 5 8 4 4 5 5 4 1 4 4 3 4 3 6 4 4 4 3 3 1 6 1 4 3 5 5 5 4 3 3 5
## [75] 3 2 3 3 3 4 1 4 3 1 5 2 3 6 8 6 5 5 8 7 5 3 8 5 7 4 3 5 2 3 7 3 3 1 2 3 2
## [112] 3 4 4 3 1 4 3 5 4 5 5 3 3 1 3 5 8 4 6 1 1 8 5 3 3 3 1 2 4 7 5 5 7 7 7 4 1
## [149] 1 2 4 7 3 3 1 3 5 4 1 2 4 3 1 2 3 3 2 3 3 3 3 2 4 5 4 4 3 3 2 4 1 1 3 5 3
## [186] 4 6 8 1 4 8 3 4 4 5 7 5 3 4 4 7 6 7 3 4 8 5 3 3 6 6 7 7 2 2 3 4 3 3 3 1 1
## [223] 4 5 1 2 1 2 3 3 1 3 4 2 3 1 3 3 4 5 3 4 2 4 3 4 4 3 2 2 1 3 2 3 1 4 4 3 1
## [260] 1 5 4 4 1 4 4 4 4 7 4 8 4 8 3 1 4 4 4 3 5 8 4 5 4 6 1 8 1 3 3 6 4 7 7 5 4
## [297] 5 4 4 3 4 1 1 3 1 3 3 3 2 1 4 4 7 2 2 4 1 4 1 4 4 4 4 3 7 3 8 5 8 3 3 3 1
## [334] 8 3 6 7 5 8 5 4 1 1 1 8 3 3 4 6 8 5 4 3 6 2 3 3 3 3 3 1 4 4 6 7 8 7 5 3 4
## [371] 3 3 1 4 4 3 3 7 3 4 1 4 4 5 1 3 4 4 4 3 4 4 4 4 4 4 5 3 2 3 4 1 1 4 4 4 1
## [408] 7 3 4 8 5 4 8 1 1 7 7 8 3 7 4 8 3 7 1 8 5 4 4 3 6 4 8 5 3 7 4 5 4 6 6 7 6
## [445] 4 3 4 6 7 4 4 1 7 1 4 5 7 3 3 1 2 3 1 2 7 5 3 7 5 3 1 1 3 4 1 1 1 1 1 3 4
## [482] 4 4 4 5 4 5 3 3 4 3 4 3 4 3 4 3 8 4 5 3 3 4 1 8 4 4 4 7 4 4 4 7 1 4 7 3 4
## [519] 7 7 7 7 4 7 8 7 3 6 4 4 1 3 4 4 4 4 7 7 6 5 4 1 5 7 4 5 3 7 3 5 4 4 3 4 5
## [556] 1 1 1 4 4 1 1 1 4 1 3 3 5 1 4 2 4
# Add a variable (kclust_8) to the original dataset
# containing the cluster assignments
imdb_clean <- imdb_clean %>%
mutate(kclust_8 = factor(kclust_k8$cluster))
# Visualize the cluster assignments on the original scatterplot
imdb_clean %>%
ggplot(aes(x = Budget, y = Runtime, color = kclust_8)) +
geom_point() + theme_classic()
# NOTE: Pick either k = 8 or k = 10 --> there are 7 genres with larger n and 13 total genres.
# Film noir is a fancy way of saying crime drama (ex. Knives Out). One thing to note is that a
# significant number of movies have Genre_2 listed as Family, Mystery, or Horror (and many have
# one of the main genres listed as genre 2 as well). What is a good way to summarize both
# Genre 1 and 2?
# Genre frequencies overall
imdb_clean %>%
count(Genre_1)
imdb_clean %>%
count(Genre_2)
# Average log-profit and budget within each k = 10 cluster
imdb_clean %>%
group_by(kclust_10) %>%
summarize(across(c(Gross, Budget), mean))
# Genre composition and sizes of the k = 8 clusters
imdb_clean %>%
group_by(kclust_8) %>%
count(Genre_1)
imdb_clean %>%
group_by(kclust_8) %>%
count(Genre_2)
imdb_clean%>%
count(kclust_8)
# Random subsample of 25 movies (keeps the dendrogram readable)
set.seed(253)
imdb_hc <- imdb_clean %>%
slice_sample(n = 25)
# Select the variables to be used in clustering
imdb_hc_sub <- imdb_hc %>%
select(Gross, Budget)
imdb_hc_full <- imdb_clean %>%
select(Gross, Budget)
# Summary statistics for the variables
summary(imdb_hc_sub)
## Gross Budget
## Min. :13.82 Min. : 0
## 1st Qu.:15.98 1st Qu.: 3000000
## Median :17.54 Median : 18500000
## Mean :17.34 Mean : 39318800
## 3rd Qu.:18.88 3rd Qu.: 60000000
## Max. :20.31 Max. :190000000
# Compute a distance matrix on the scaled data
dist_mat_scaled <- dist(scale(imdb_hc_sub))
dist_mat_full <- dist(scale(imdb_hc_full))
# Hierarchical clustering with average linkage (subsample and full data)
imdb_hc_avg <- hclust(dist_mat_scaled, method = "average")
imdb_full_avg <- hclust(dist_mat_full, method = "average")
# Plot dendrogram
plot(imdb_hc_avg)
# Relabel leaves by primary genre, then by primary + secondary genre
plot(imdb_hc_avg, labels = imdb_hc$Genre_1)
plot(imdb_hc_avg, labels = paste(imdb_hc$Genre_1, imdb_hc$Genre_2))
imdb_clean <- imdb_clean %>%
mutate(
hclust_num = factor(cutree(imdb_full_avg, k = 3)) # Cut into 3 clusters (k)
)
# Genre composition within each hierarchical cluster
ggplot(imdb_clean, aes(x = hclust_num, fill = Genre_1)) +
geom_bar(position = "fill") +
labs(x = "Cluster") +
theme_classic()